This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Dataset first preparation and libraries import
library(lme4)
library(ggplot2)
library(plotrix)
library(visreg)
library(stringr)
library(dplyr)
spr_output_data <- data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/spr_raxml_15.csv",header = TRUE, na.strings = ""))
lasso_output_data <- data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/lasso_eval.csv",header = TRUE, na.strings = ""))
data_source = data.frame(read.csv("/Users/noa/Workspace/data/sampled_datasets.csv",header = TRUE, na.strings = ""))
lasso_output_data$relative_path = str_replace_all(lasso_output_data$dataset_id,'(/groups/pupko/noaeker/data/ABC_DR/)|(/ref_msa.aa.phy)',"")
spr_output_data$relative_path = str_replace_all(spr_output_data$dataset_id,'(/groups/pupko/noaeker/data/ABC_DR/)|(/ref_msa.aa.phy)',"")
data= merge(spr_output_data,data_source,by.x="relative_path",by.y="path")
if (nrow(data)<nrow(spr_output_data))
{print("Problem in matching path names")}
data=data[data$current_starting_tree_type=="RANDOM"&data$n_seq==15 ,]
data_only_lasso= merge(lasso_output_data,data_source,by.x="relative_path",by.y="path")
if (nrow(data_only_lasso)<nrow(lasso_output_data))
{print("Problem in matching path names")
cat("nrow data=",nrow(data), "nrow lasso=",nrow(data_only_lasso))
}
#Lasso train and test example
test_df= data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/test_sitelh_df_prediction.csv",header = TRUE, na.strings = ""))
names(test_df)[2:3]=c("test_true_val","test_predicted_val")
training_df=data.frame(read.csv("/Users/noa/Workspace/lasso_positions_sampling_results/training_sitelh_df_prediction.csv",header = TRUE, na.strings = ""))
names(training_df)[2:3]=c("training_true_val","training_predicted_val")
#Adding new fields:
data$rho_second_phase = data$naive_SPR_spr_moves/(data$lasso_SPR_second_phase_spr_moves+(data$lasso_SPR_first_phase_spr_moves*data$sample_pct))
data$rho_first_phase = data$naive_SPR_spr_moves/(data$lasso_SPR_first_phase_spr_moves*data$sample_pct)
data$delta_ll_second_phase = data$naive_SPR_ll-data$lasso_SPR_second_phase_ll
data$delta_ll_first_phase = data$naive_SPR_ll-data$lasso_SPR_first_phase_ll
num_cols <- unlist(lapply(data, is.numeric))
agg_per_msa = aggregate(data[num_cols], by=list(data$dataset_id), mean,na.action = na.pass)
example_MSA_data = data%>%filter(dataset_id=="/groups/pupko/noaeker/data/ABC_DR/Selectome/Euteleostomi/ENSGT00600000084481/ref_msa.aa.phy")
Usefull functions
plot.hist<-function(x,main,xlab, title_size=13, text_size=12)
{
qplot(x,
geom="histogram",
main = main,
xlab = xlab,
fill=I("blue"),
col=I("red"),
alpha=I(.2),
#xlim=c(20,50)
) + theme(plot.title = element_text(hjust = 0.5,size=title_size)) +theme(text = element_text(size = text_size))
}
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
simple.regression<-function(x,y,main,xlab, ylab,cex=1.0,resid_plot=FALSE)
{linearMod <- lm((y) ~ x)
print(main)
print(summary(linearMod))
r.squared <- summary(linearMod)$r.squared
p.val <- lmp(linearMod)
plot(x, y,main=main,
xlab = xlab, ylab = ylab,pch = 16, col = "blue",cex.main=1,cex=cex)
abline(linearMod)
mylabel = bquote(italic(R)^2 == .(format(r.squared, digits = 3)))
text(x = 19, y = 2.5, labels = mylabel)
rp = vector('expression',2)
rp[1] = substitute(expression(italic(R)^2 == MYVALUE),
list(MYVALUE = format(r.squared,dig=4)))[2]
rp[2] = substitute(expression(italic(p.value) == MYOTHERVALUE),
list(MYOTHERVALUE = format(p.val, digits = 2)))[2]
legend('topleft', legend = rp, bty = 'n')
if (resid_plot==TRUE)
{plot(fitted(linearMod),resid(linearMod), main =main,xlab=xlab)
abline(0, 0)
}
return(c(r.squared,p.val))
}
two.groups.hist<-function(vec.a,vec.b,name.a,name.b,title,x.axis.name, title.size=22)
{
df.a <- data.frame(data = vec.a, name=name.a)
df.b <- data.frame(data = vec.b,name=name.b)
df.ab <- rbind(df.a, df.b)
ggplot(df.ab, aes(data, fill = name)) + geom_density(alpha = 0.2)+
theme(legend.title=element_blank())+labs(title=title)+xlab(x.axis.name)+
theme(plot.title = element_text(hjust = 0.5))+
theme(plot.title = element_text(size=title.size))
}
#two.groups.hist(c(1,1,12,2,2,2,2),c(1,1,12,2,2,2,2),"name.a","name.b","title","x.axis.name")
#simple.regression(c(1,2,3,4,5,6,7),c(5,7,9,12,13,14,15),"","","")
Counting the different sources
file_name_and_source_lasso=data_only_lasso[!duplicated(data_only_lasso[ , c("db","dataset_id")]),]
table(file_name_and_source_lasso$db)
PANDIT Selectome
402 462
print("Job finished only Lasso:")
[1] "Job finished only Lasso:"
print(unique(data_only_lasso$job_id[order(data_only_lasso$job_id)]))
[1] 1 3 4 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
[40] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
[79] 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
file_name_and_source=data[!duplicated(data[ , c("db","dataset_id")]),]
table(file_name_and_source$db)
Selectome
48
print("Job finished only Lasso:")
[1] "Job finished only Lasso:"
print(unique(data$job_id[order(data$job_id)]))
[1] 1 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
[40] 41 42 43 44 45 46 47 48 49
Find good datasets for Lasso example:
select(data_only_lasso%>%filter(sample_pct<0.15&n_seq>=15&n_loci>3000),dataset_id,job_id,curr_msa_version_folder, n_loci,number_loci_chosen,n_seq, ,lasso_test_R.2, avg_entropy,gap_pct,informative_columns_count)
print(select(data_only_lasso%>%filter(dataset_id=="/groups/pupko/noaeker/data/ABC_DR/Selectome/Euteleostomi/ENSGT00530000063749/ref_msa.aa.phy"),dataset_id,job_id,curr_msa_version_folder, n_loci,number_loci_chosen,n_seq, ,lasso_test_R.2, avg_entropy,gap_pct,informative_columns_count))
print(example_MSA_data)
Find good datasets for SPR example:
select(data%>%filter(sample_pct<0.05),dataset_id,job_id, n_loci,number_loci_chosen,n_seq,sample_pct)
select(data%>%filter(dataset_id=="/groups/pupko/noaeker/data/ABC_DR/Selectome/Euteleostomi/ENSGT00530000063889/ref_msa.aa.phy"))
Find datasets with low test R^2
select(data_only_lasso%>%filter(lasso_test_R.2<0.9), dataset_id,lasso_test_R.2,gap_pct,lasso_training_R.2, n_seq, n_loci, number_loci_chosen, mad, divergence,avg_entropy)
Find datasets with low pearson during tree search
low_pearson_datasets = select(data%>%filter(ll_pearson_during_tree_search==min(data$ll_pearson_during_tree_search)),dataset_id, n_loci,ll_pearson_during_tree_search,delta_ll_second_phase)
print(low_pearson_datasets)
NA
Find datasets with high delta log likelihood:
high_delta_ll_datasets_second_phase = select(data%>%filter(delta_ll_second_phase==max(data$delta_ll_second_phase)),dataset_id, mistake_cnt, n_loci, delta_ll_first_phase, delta_ll_second_phase,ll_pearson_during_tree_search)
print(high_delta_ll_datasets_second_phase)
NA
An example of the LL approximation Change the example to another one
#Specific example of Lasso:
#Make sure what is the #positions chosen by lasso here.
lasso_example_msa_data = data_only_lasso[data_only_lasso$dataset_id=="/groups/pupko/noaeker/data/ABC_DR/Selectome/Euteleostomi/ENSGT00530000063889/ref_msa.aa.phy", ]
print(select(lasso_example_msa_data,"dataset_id","job_id","curr_msa_version_folder","n_seq", "db","number_loci_chosen","n_loci","alpha","lasso_training_R.2", "lasso_test_R.2"))
training_df["training delta ll"] =training_df["training_true_val"]-training_df["training_predicted_val"]
test_df["test delta ll"] =test_df["test_true_val"]-test_df["test_predicted_val"]
print(training_df[,c("training_predicted_val","training_true_val","training delta ll")])
print(test_df[,c("test_predicted_val","test_true_val","test delta ll")])
simple.regression(training_df$training_true_val,training_df$training_predicted_val,"Training set predicted log likelihood vs true log likelihood","Training set true log likelihood","Training set predicted log likelihood")
[1] "Training set predicted log likelihood vs true log likelihood"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-143.020 -23.414 0.168 25.502 117.666
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.737e+02 3.875e+01 -4.482 8.24e-06 ***
x 9.981e-01 4.321e-04 2309.628 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 36.78 on 998 degrees of freedom
Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
F-statistic: 5.334e+06 on 1 and 998 DF, p-value: < 2.2e-16
[1] 0.9998129 0.0000000
simple.regression(test_df$test_true_val,test_df$test_predicted_val,"Test set predicted log likelihood vs true log likelihood","Test set true log likelihood","Test set predicted log likelihood",cex=1)
[1] "Test set predicted log likelihood vs true log likelihood"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-350.95 -44.11 9.20 52.60 141.26
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.329e+02 2.898e+02 -1.494 0.138
x 9.952e-01 3.238e-03 307.386 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 76.82 on 98 degrees of freedom
Multiple R-squared: 0.999, Adjusted R-squared: 0.999
F-statistic: 9.449e+04 on 1 and 98 DF, p-value: < 2.2e-16
[1] 9.989639e-01 4.575868e-148
LL approximation on xxx datasets:
print("lasso training R^2:")
[1] "lasso training R^2:"
print(summary(data_only_lasso$lasso_training_R.2))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.9980 0.9997 0.9998 0.9998 0.9999 1.0000
print("lasso test R^2:")
[1] "lasso test R^2:"
print(summary(data_only_lasso$lasso_test_R.2))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7416 0.9985 0.9993 0.9984 0.9997 1.0000
print("lasso sample pct :")
[1] "lasso sample pct :"
print(summary(data_only_lasso$sample_pct))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.01022 0.12491 0.25114 0.31900 0.43955 0.93358
plot.hist(data_only_lasso$lasso_test_R.2, main="Test R^2 of Lasso approximation", xlab="R^2")
plot.hist(data_only_lasso$sample_pct,main="Distribution of the fraction of positions chosen by Lasso", xlab="fraction of positions chosen by Lasso")
plot.hist(data_only_lasso$number_loci_chosen,main="Number of positions chosen by Lasso across MSAs", xlab="number of positions chosen by Lasso")
simple.regression(x=data_only_lasso$mad, y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. mad", xlab="mad",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. mad"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256153 0.000047 0.000867 0.001286 0.003115
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9994282 0.0009529 1048.821 <2e-16 ***
x -0.0047566 0.0040907 -1.163 0.245
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.009175 on 862 degrees of freedom
Multiple R-squared: 0.001566, Adjusted R-squared: 0.0004078
F-statistic: 1.352 on 1 and 862 DF, p-value: 0.2452
[1] 0.001566037 0.245243337
simple.regression(x=data_only_lasso$gap_pct, y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. percentage of gaps in MSA", xlab="percentage of gaps",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. percentage of gaps in MSA"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256087 -0.000049 0.000666 0.001296 0.002796
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9970321 0.0007309 1364.113 <2e-16 ***
x 0.0038935 0.0019079 2.041 0.0416 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.00916 on 862 degrees of freedom
Multiple R-squared: 0.004808, Adjusted R-squared: 0.003654
F-statistic: 4.165 on 1 and 862 DF, p-value: 0.04158
[1] 0.004808192 0.041579250
simple.regression(x=data_only_lasso$divergence, y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. divergence", xlab="divergence",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. divergence"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256679 0.000088 0.000904 0.001303 0.001740
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.982e-01 3.451e-04 2892.812 <2e-16 ***
x 6.320e-06 6.899e-06 0.916 0.36
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.009177 on 862 degrees of freedom
Multiple R-squared: 0.0009726, Adjusted R-squared: -0.0001863
F-statistic: 0.8392 on 1 and 862 DF, p-value: 0.3599
[1] 0.0009726353 0.3598738914
simple.regression(x=data_only_lasso$avg_entropy, y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. Entropy", xlab="Entropy",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. Entropy"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256471 0.000153 0.000726 0.001191 0.002443
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9992127 0.0006525 1531.471 <2e-16 ***
x -0.0010298 0.0007098 -1.451 0.147
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.009171 on 862 degrees of freedom
Multiple R-squared: 0.002436, Adjusted R-squared: 0.001279
F-statistic: 2.105 on 1 and 862 DF, p-value: 0.1472
[1] 0.002436041 0.147182615
simple.regression(x=data_only_lasso$n_loci, y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. number of positions", xlab="Number of positions",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. number of positions"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256411 -0.000012 0.000780 0.001303 0.002189
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.976e-01 5.403e-04 1846.371 <2e-16 ***
x 1.043e-06 5.689e-07 1.833 0.0671 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.009164 on 862 degrees of freedom
Multiple R-squared: 0.003884, Adjusted R-squared: 0.002728
F-statistic: 3.361 on 1 and 862 DF, p-value: 0.06711
[1] 0.003883758 0.067108198
simple.regression(x=data_only_lasso$number_loci_chosen , y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. number of positions chosen by Lasso", xlab="Number of positions chosen bvy lasso",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. number of positions chosen by Lasso"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256035 -0.000147 0.000723 0.001433 0.002462
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.974e-01 6.142e-04 1624.015 <2e-16 ***
x 5.179e-06 2.935e-06 1.765 0.078 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.009165 on 862 degrees of freedom
Multiple R-squared: 0.0036, Adjusted R-squared: 0.002444
F-statistic: 3.115 on 1 and 862 DF, p-value: 0.07795
[1] 0.003600139 0.077950832
simple.regression(x=data_only_lasso$n_seq , y=data_only_lasso$lasso_test_R.2, main= "Lasso test R^2 vs. number of sequences", xlab="Number of sequences",ylab="lasso_test_R.2")
[1] "Lasso test R^2 vs. number of sequences"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.256508 0.000087 0.000855 0.001281 0.001896
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.981e-01 3.792e-04 2631.810 <2e-16 ***
x 8.589e-06 5.733e-06 1.498 0.134
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.00917 on 862 degrees of freedom
Multiple R-squared: 0.002598, Adjusted R-squared: 0.00144
F-statistic: 2.245 on 1 and 862 DF, p-value: 0.1344
[1] 0.002597509 0.134423059
ggplot(data=data_only_lasso,aes(x=n_loci,y=sample_pct))+
geom_point()+xlab("Number of positions") + ylab("Fraction of popsitions chosen by Lasso")+ggtitle("Fraction of positions chosen by Lasso vs. number of positions")+theme(axis.text=element_text(size=12),
axis.title=element_text(size=14))+geom_smooth(se = FALSE)
NA
NA
NA
NA
NA
NA
NA
NA
NA
Effect on the SPR search on the example MSA: Many starting trees on one chosen dataset
example_MSA_data = data%>%filter(dataset_id=="/groups/pupko/noaeker/data/ABC_DR/Selectome/Euteleostomi/ENSGT00600000084481/ref_msa.aa.phy")
print(select(example_MSA_data,dataset_id,job_id, n_loci,number_loci_chosen,n_seq,sample_pct,alpha))
print("Delta ll summary:")
[1] "Delta ll summary:"
print(summary(example_MSA_data$delta_ll_first_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.006565 0.940051 0.943887 0.756157 0.944686 0.945596
print("Rho summary:")
[1] "Rho summary:"
print(summary(example_MSA_data$rho_first_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
17.14 19.48 19.64 19.82 21.42 21.42
plot.hist(example_MSA_data$delta_ll_first_phase, main="Distribution of delta log likelihood values between naive SPR and its lasso based variant", xlab="Delta log likelihood", title_size=12, text_size=11)
best_tree = max(example_MSA_data$overall_best_topology_first_phase ) # removing duplicates
cat("best tree is :",best_tree,"\n") #lasso got it xxx times, naive got it yyy times
best tree is : (0002:0.0140625,(0031:0.1,(0032:0.05,0028:0.1)N5:0.00625)N2:0.0125,(0003:0.1,(((0011:0.05,0006:0.1)N12:0.025,(0010:0.1,0008:0.1)N13:0.05)N10:0.2,((0012:0.2,0017:0.6)N14:0.0125,((0023:0.1,0022:0.20625)N22:0.05,(0018:0.05,0019:0.2)N23:0.0125)N15:0.025)N11:0.00625)N7:0.00625)N3:0.00703125);
lasso_cnt_best_tree=sum(example_MSA_data$rf_first_phase_vs_overall_best==0)
naive_cnt_best_tree=sum(example_MSA_data$rf_naive_vs_overall_best_first_phase==0)
cat("Lasso got best tree",lasso_cnt_best_tree,"times\n")
Lasso got best tree 1 times
cat("Naive got best tree",naive_cnt_best_tree,"times\n")
Naive got best tree 5 times
avg_rf_best_tree_lasso= mean(example_MSA_data$rf_first_phase_vs_overall_best)
avg_rf_best_tree_naive=mean(example_MSA_data$rf_naive_vs_overall_best_first_phase)
cat("The average RF between best tree and lasso is:",avg_rf_best_tree_lasso,"\n")
The average RF between best tree and lasso is: 0.0666664
cat("The average RF between best tree and naive is:", avg_rf_best_tree_naive,"\n")
The average RF between best tree and naive is: 0
average_spr_moves_naive = mean(example_MSA_data$naive_SPR_spr_moves)
average_spr_moves_lasso_first_phase = mean(example_MSA_data$lasso_SPR_first_phase_spr_moves)
cat("Average number of SPR steps in naive SPR is:", average_spr_moves_naive,"\n")
Average number of SPR steps in naive SPR is: 9.2
cat("Average number of SPR steps in lasso first phase SPR variant is:", average_spr_moves_lasso_first_phase,"\n" )
Average number of SPR steps in lasso first phase SPR variant is: 10
plot.hist(example_MSA_data$rho_first_phase,main="Distribution of the running time decrease factor",xlab=" Running time decrease factor")
Effect on the SPR search on 48 datasets:
print("Summary of R squared during tree search:")
[1] "Summary of R squared during tree search:"
print(summary(data$ll_pearson_during_tree_search^2))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.9965 0.9995 0.9997 0.9996 0.9998 1.0000
print("Summary of delta ll:")
[1] "Summary of delta ll:"
print(summary(data$delta_ll_first_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.243620 0.000902 0.066764 1.661029 2.633477 14.172631
print("Summary: Standard SPR spr moves:")
[1] "Summary: Standard SPR spr moves:"
print(summary(data$naive_SPR_spr_moves))
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.000 8.000 9.000 9.117 10.000 13.000
print("Summary: Lasso SPR spr moves")
[1] "Summary: Lasso SPR spr moves"
print(summary(data$lasso_SPR_first_phase_spr_moves))
Min. 1st Qu. Median Mean 3rd Qu. Max.
6.000 9.000 10.000 9.771 11.000 13.000
print("Summary: rho")
[1] "Summary: rho"
print(summary(data$rho_first_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.247 7.663 10.464 11.990 14.602 35.614
print("Summary:RF, lasso vs.overall best")
[1] "Summary:RF, lasso vs.overall best"
print(summary(data$rf_first_phase_vs_overall_best))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.08333 0.08229 0.08333 0.41667
print("Summary:RF, naive vs.overall best")
[1] "Summary:RF, naive vs.overall best"
print(summary(data$rf_naive_vs_overall_best_first_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01458 0.00000 0.33333
plot.hist(data$ll_pearson_during_tree_search^2, main = "Distribution R^2 during tree search", xlab="R^2 during tree search")
two.groups.hist(data$naive_SPR_spr_moves, data$lasso_SPR_first_phase_spr_moves,"Standard SPR spr moves","Lasso-based SPR spr moves","Number of spr moves in standard SPR in the Lasso-based variant","Distribution of number of spr moves",title.size=12)
two.groups.hist(data$rf_first_phase_vs_overall_bes, data$rf_naive_vs_overall_best_first_phase,"Lasso-based SPR","Standard SPR","Robinson- Foulds relative distance from best tree","Robinson-Foulds relative distance",title.size=12)
plot.hist(data$rho_first_phase,main="Distribution of running time decrease factor",xlab="Running time decrease factor")
plot.hist(data$delta_ll_first_phase ,main="Distribution of delta log likelihood values",xlab="delta ll")
#plot.hist(data$mistake_cnt,"Histogram of mistakes count per each MSA and starting tree", xlab="mistake cnt")
simple.regression(x=data$ll_pearson_during_tree_search^2,y=data$lasso_test_R.2,main="Lasso test R^2 vs R^2 during tree search",xlab="R^2 during tree search",ylab="Lasso test R^2")
[1] "Lasso test R^2 vs R^2 during tree search"
Call:
lm(formula = (y) ~ x)
Residuals:
Min 1Q Median 3Q Max
-0.0116057 -0.0000718 0.0001014 0.0003312 0.0060257
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.8798 0.2757 -3.192 0.00161 **
x 1.8797 0.2758 6.816 7.61e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.001767 on 238 degrees of freedom
Multiple R-squared: 0.1633, Adjusted R-squared: 0.1598
F-statistic: 46.46 on 1 and 238 DF, p-value: 7.61e-11
[1] 1.633361e-01 7.610378e-11
#same on aggregated data:
#plot.hist(agg_per_msa$ll_pearson_during_tree_search^2, main = "Histogram of R^2 during tree search (aggregated)", xlab="R^2 during tree #search")
#plot.hist(agg_per_msa$naive_SPR_spr_moves, main = "Histogram of SPR naive moves across different MSAs and starting trees (aggregated)", #xlab="SPR moves")
#plot.hist(agg_per_msa$rho_first_phase,main="Histogram of rho across different MSAs and starting trees (aggregated)",xlab="rho")
#plot.hist(agg_per_msa$delta_ll_first_phase ,main="Histogram of delta likelihood values across different MSAs and starting trees #(aggregated)",xlab="rho")
#simple.regression(x=agg_per_msa$lasso_test_R.2,y=agg_per_msa$ll_pearson_during_tree_search^2,main="Lasso test R^2 vs R^2 during tree search #(aggregated)",xlab="Lasso test R^2",ylab="R^2 during tree search")
Effect on the SPR search on the example MSA: MODIFIED SPR SEARCH (including second phase)
cat("Average value of running time decrease factor is:",mean(example_MSA_data$rho_second_phase),"\n")
Average value of running time decrease factor is: 9.374455
plot.hist(example_MSA_data$delta_ll_second_phase , main="Distribution of delta log likelihood values after the second phase", xlab="Delta log likelihood", title_size=12, text_size=11)
best_tree = max(example_MSA_data$overall_best_topology_second_phase) # removing duplicates
cat("best tree is :",best_tree,"\n") #lasso got it xxx times, naive got it yyy times
best tree is : (0012:0.2,0017:0.1,(((0018:0.05,0019:0.2)N6:0.125,(0022:0.05,0023:0.1)N7:0.025)N4:0.1,((0003:0.05,(0002:0.05,(0031:0.05625,(0028:0.05,0032:0.1)N25:0.00625)N19:0.028125)N15:0.025)N8:0.025,((0006:0.2,0011:0.125)N16:0.00625,(0010:0.1,0008:0.2)N17:0.3)N9:0.003125)N5:0.025)N3:0.1);
lasso_cnt_best_tree=sum(example_MSA_data$rf_second_phase_vs_overall_best ==0)
naive_cnt_best_tree=sum(example_MSA_data$rf_naive_vs_overall_best_second_phase==0)
cat("Lasso got best tree",lasso_cnt_best_tree,"times\n")
Lasso got best tree 5 times
cat("Naive got best tree",naive_cnt_best_tree,"times\n")
Naive got best tree 5 times
avg_rf_best_tree_lasso= mean(example_MSA_data$rf_second_phase_vs_overall_best)
avg_rf_best_tree_naive=mean(example_MSA_data$rf_naive_vs_overall_best_second_phase)
cat("The average RF between best tree and lasso is:",avg_rf_best_tree_lasso,"\n")
The average RF between best tree and lasso is: 0
cat("The average RF between best tree and naive is:", avg_rf_best_tree_naive,"\n")
The average RF between best tree and naive is: 0
average_spr_moves_naive = mean(example_MSA_data$naive_SPR_spr_moves)
average_spr_moves_lasso_first_phase = mean(example_MSA_data$lasso_SPR_first_phase_spr_moves)
average_spr_moves_lasso_seond_phase = mean(example_MSA_data$lasso_SPR_second_phase_spr_moves)
cat("Average number of SPR steps in naive SPR is:", average_spr_moves_naive,"\n")
Average number of SPR steps in naive SPR is: 9.2
cat("Average number of SPR steps in lasso first phase SPR variant is:", average_spr_moves_lasso_first_phase,"\n" )
Average number of SPR steps in lasso first phase SPR variant is: 10
cat("Average number of SPR steps in lasso second phase SPR variant is:", average_spr_moves_lasso_seond_phase,"\n" )
Average number of SPR steps in lasso second phase SPR variant is: 0.8
plot.hist(example_MSA_data$rho_second_phase ,main="Distribution of running time decrease factor using the modified Lasso SPR search",xlab="running time decrease factor using the modified Lasso SPR search")
plot.hist(example_MSA_data$lasso_SPR_second_phase_spr_moves, main="Histogram of number of SPR moves during the Lasso second phase",xlab="Spr moves")
print(example_MSA_data$lasso_SPR_second_phase_spr_moves)
[1] 1 0 1 1 1
Effect on the SPR search on the example MSA on 48 datasets: MODIFIED SPR SEARCH (including second phase)
print("Summary of number of steps during the second SPR")
[1] "Summary of number of steps during the second SPR"
print(summary(data$lasso_SPR_second_phase_spr_moves))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 0.675 1.000 3.000
print("Summary of running time decrease factor :")
[1] "Summary of running time decrease factor :"
print(summary(data$rho_second_phase ))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.507 4.335 6.292 7.522 8.862 21.423
print("Summary of RF dist Lasso vs. best second phase:")
[1] "Summary of RF dist Lasso vs. best second phase:"
print(summary(data$rf_second_phase_vs_overall_best))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01285 0.00000 0.33333
print("Summary of RF dist Standard vs. best second phase:")
[1] "Summary of RF dist Standard vs. best second phase:"
print(summary(data$rf_naive_vs_overall_best_second_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.01806 0.00000 0.33333
print("Summary of delta log likelihood")
[1] "Summary of delta log likelihood"
print(summary(data$delta_ll_second_phase))
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.245300 -0.000029 0.000003 -0.041377 0.000038 1.552383
two.groups.hist(data$rf_second_phase_vs_overall_best, data$rf_naive_vs_overall_best_second_phase,"Lasso-based modified SPR","Standard SPR","Distribution of Robinson-Foulds relative distance from best tree","Robinson Foulds relative distance",title.size=12)
plot.hist(data$lasso_SPR_second_phase_spr_moves , main = "Distribution of number of spr moves during second phase", xlab="SPR moves")
plot.hist(data$rho_second_phase ,main="Distribution of running time decrease factor of modified Lasso-based SPR ",xlab="Running time efficiency factor of Lasso-based SPR")
plot.hist(data$delta_ll_second_phase ,main="Distribution of delta log likelihood values after the second phase",xlab="delta ll (second phase)")
#same on aggregated data:
#plot.hist(agg_per_msa$lasso_SPR_second_phase_spr_moves , main = "Histogram of Lasso second phase spr moves across different MSAs #and starting trees (aggregated)", xlab="SPR moves",title_size = 10)
#plot.hist(agg_per_msa$rho_second_phase ,main="Histogram of rho (aggregated second phase) across different MSAs and starting #trees",xlab="rho (second_phase)",title_size = 10)
#plot.hist(agg_per_msa$delta_ll_second_phase ,main="Histogram of delta likelihood (aggregated second phase) values across #different MSAs and starting trees",xlab="delta ll (second phase)",title_size = 10)
Visualizing fitted regression line of rho~ n_loci per dataset
library(visreg)
lm = lm(log(rho_second_phase)~n_loci+naive_SPR_spr_moves+dataset_id , data = data)
visreg(lm, "naive_SPR_spr_moves", by=("dataset_id") ,overlay =TRUE, legend = FALSE)
Generating a linear mixed model
#centering predictors
plot(example_MSA_data$naive_SPR_spr_moves, example_MSA_data$rho_second_phase)
plot(data$naive_SPR_spr_moves, data$rho_second_phase)
data$n_loci_centered_per_1000 = (data$n_loci-mean(data$n_loci))/1000
data$avg_entropy_centered = data$avg_entropy-mean(data$avg_entropy)
data$naive_SPR_spr_moves_centered = data$naive_SPR_spr_moves -mean(data$naive_SPR_spr_moves)
#mixed model
rho_lmer = lmer(log(rho_second_phase)~(1|dataset_id)+n_loci_centered_per_1000+naive_SPR_spr_moves_centered+data$avg_entropy_centered,data)
print(summary(rho_lmer))
Linear mixed model fit by REML ['lmerMod']
Formula: log(rho_second_phase) ~ (1 | dataset_id) + n_loci_centered_per_1000 +
naive_SPR_spr_moves_centered + data$avg_entropy_centered
Data: data
REML criterion at convergence: 53.5
Scaled residuals:
Min 1Q Median 3Q Max
-5.2389 -0.2725 0.0067 0.2777 5.5309
Random effects:
Groups Name Variance Std.Dev.
dataset_id (Intercept) 0.21319 0.4617
Residual 0.03543 0.1882
Number of obs: 240, groups: dataset_id, 48
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.87752 0.06774 27.715
n_loci_centered_per_1000 0.28015 0.09910 2.827
naive_SPR_spr_moves_centered 0.05568 0.01168 4.767
data$avg_entropy_centered -0.52302 0.44905 -1.165
Correlation of Fixed Effects:
(Intr) n____1 n_SPR_
n_lc___1000 0.000
nv_SPR_sp__ 0.000 0.004
dt$vg_ntrp_ 0.000 -0.027 -0.061
qqnorm(ranef(rho_lmer)$dataset_id[, "(Intercept)"], main = "Random effects")
abline(a=0,b=1)
plot((rho_lmer))
NA
NA
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.